library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
green_url <- "https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2020-02.csv"
trip_df <- read_csv(green_url)
## Rows: 398632 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): store_and_fwd_flag
## dbl (16): VendorID, RatecodeID, PULocationID, DOLocationID, passenger_count...
## lgl (1): ehail_fee
## dttm (2): lpep_pickup_datetime, lpep_dropoff_datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
There are about 20% missing values in vendorID, trip_type, store_and_fwd_flag, retacodeID, payment_type, passenger_count, and congestion_surcharge. There’s no value in Ehail_fee column, hence drop it.
trip_df_missing_freq <- sapply(trip_df, function(x) {sum(is.na(x))/nrow(trip_df)})
tibble( names = names(trip_df_missing_freq), missing_freq = trip_df_missing_freq) %>%
ggplot() +
geom_bar( mapping = aes(x = names, y = missing_freq), stat = 'identity') +
coord_flip()
In total 19 columns and 1 is categorical and 2 are in date format.
glimpse(trip_df)
## Rows: 398,632
## Columns: 20
## $ VendorID <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, …
## $ lpep_pickup_datetime <dttm> 2020-02-01 00:10:25, 2020-02-01 00:16:59, 2020-…
## $ lpep_dropoff_datetime <dttm> 2020-02-01 00:14:34, 2020-02-01 00:21:35, 2020-…
## $ store_and_fwd_flag <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N"…
## $ RatecodeID <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, …
## $ PULocationID <dbl> 74, 74, 223, 145, 166, 166, 7, 112, 212, 212, 7,…
## $ DOLocationID <dbl> 41, 74, 7, 145, 166, 41, 173, 80, 213, 58, 179, …
## $ passenger_count <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, …
## $ trip_distance <dbl> 0.76, 0.72, 0.89, 1.12, 0.65, 1.11, 2.89, 0.87, …
## $ fare_amount <dbl> 4.5, 5.0, 6.0, 6.0, 4.0, 6.0, 14.0, 6.0, 10.0, 1…
## $ extra <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5…
## $ mta_tax <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5…
## $ tip_amount <dbl> 0.00, 0.00, 1.82, 0.00, 1.06, 0.00, 0.00, 0.00, …
## $ tolls_amount <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, …
## $ ehail_fee <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ improvement_surcharge <dbl> 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3…
## $ total_amount <dbl> 5.80, 6.30, 9.12, 7.30, 6.36, 7.30, 15.30, 7.30,…
## $ payment_type <dbl> 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, …
## $ trip_type <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ congestion_surcharge <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, …
Examine if there are abnormal values in the “RateCodeID” column, we can see there is code 99 in the column, but should be only 1 to 6 in the data dictionary. Therefore any values with code 99 should be removed or defined.
trip_df %>%
pull( RatecodeID) %>%
unique()
## [1] 1 4 5 3 2 6 99 NA
Number of outliers in each column is shown in the figure below. Tolls amount, total amount, tip amount, extra, and fare amount are continuous variables, which means other variables would not include any outlier in them.
For more detail, tolls amount contains 379978 zeros and 18654 non zeros. Therefore, these shouldn’t be considered as outliers if we want to remove them in the future.
n_outliers <- function(x){
sum(abs((x[!is.na(x)] - mean(x, na.rm = TRUE)) / sd(x , na.rm = TRUE)) > 3)
}
enframe(sapply(trip_df[, 5:19], n_outliers)) %>%
ggplot() +
geom_bar( mapping = aes(x=name, y = value ), stat='identity') +
coord_flip() +
labs( x = "Number of outliers", y = "Variables", title = "Number of outliers of each variable.")
For total amount, there are many negative amounts which means return for passengers. There are 1152 negative values.
trip_df %>%
filter( total_amount < 0) %>%
pull(total_amount) %>%
length()
## [1] 1152
The histogram plot shows only one visible bar with more than 30k data, therefore the data is highly skewed. This means there are several very long trip distance in this dataset. Take logarithm 10 to trip_distance for a better visualization, it looks like the normal distribution, but the x axis is actually based on log10. I also list first 20 longest trip distance in this dataset, 10 trip distances longer than 10k.
trip_df %>%
select( trip_distance) %>%
drop_na() %>%
ggplot() +
geom_histogram( mapping = aes( trip_distance))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Now plot distribution excluding outliers (defined as z score > 30). We can see here, the distribution is skewed to the left.
trip_df %>%
mutate( z = (trip_distance - mean(trip_distance) / sd(trip_distance))) %>%
filter( abs(z) < 30) %>%
select( trip_distance) %>%
ggplot() +
geom_histogram( mapping = aes( trip_distance))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Distribution of log plot looks like a normal distrubution. Therefore trip value could be logarized before data processing.
trip_df %>%
select( trip_distance) %>%
drop_na() %>%
ggplot() +
geom_histogram( mapping = aes( log10(trip_distance)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12993 rows containing non-finite values (stat_bin).
Here, I propose that the hour in a day and a day in a week could impact the amount of tips. Maybe people would pay higher tips during the weekends and midnights.
trip_df_clean_time <- trip_df %>%
mutate_at( vars( VendorID, RatecodeID, payment_type, trip_type), as.factor) %>%
mutate( pick_up_month = format( lpep_pickup_datetime, "%Y-%b")) %>%
mutate( drop_off_month = format( lpep_dropoff_datetime, "%Y-%b")) %>%
filter( !( drop_off_month != "2020-Feb" & pick_up_month != "2020-Feb")) %>%
mutate( dayofweek = format(lpep_pickup_datetime, "%w")) %>%
mutate( hourofday = as.numeric(format(lpep_pickup_datetime, "%H"))) %>%
select( dayofweek, hourofday, tip_amount) %>%
type_convert()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## dayofweek = col_double()
## )
cor(trip_df_clean_time)
## dayofweek hourofday tip_amount
## dayofweek 1.00000000 0.05313202 0.01108371
## hourofday 0.05313202 1.00000000 0.03335513
## tip_amount 0.01108371 0.03335513 1.00000000
Answer is no. There’s very weak correlation between hour in a day and day in a week with the amount of tips.
trip_df_clean_time %>%
group_by(dayofweek) %>%
summarise( average_tips = mean(tip_amount)) %>%
ggplot( ) +
geom_bar( mapping = aes(x = dayofweek, y = average_tips), stat = "identity") +
labs( title = "average tips vs. day of a week")
From figure above we can see tips amount is nearly consistent throughout a week. Therefore, this feature shouldn’t be used further.
trip_df_clean_time %>%
group_by(hourofday) %>%
summarise( average_tips = mean(tip_amount)) %>%
ggplot( ) +
geom_bar( mapping = aes(x = hourofday, y = average_tips), stat = "identity") +
labs( title = "average tips vs. hour of a day")
From the figure above, we can see that there’s a pattern between hour of a day and tip amount. How tips relates to trip distance in a day?
trip_df %>%
mutate( lpep_pickup_hour = format(lpep_pickup_datetime, "%H")) %>%
group_by( lpep_pickup_hour) %>%
summarise( median_trip_distance = median(trip_distance)) %>%
ggplot() +
geom_bar( mapping = aes( x = lpep_pickup_hour, y = median_trip_distance)
, fill ='white', color = 'black', stat = 'identity') +
labs( x = "Hour in a day", y = "Median trip distance")
From two graphs above we can see that people give more tips with shorter median trip distance of hours in a day. Therefore we may say these two trends are negatively correlated.
First take logarithm on dataset then split these values into 8 bins, then calculate the average amount of tips in each bin.
trip_df$logdis <- cut(log10(trip_df$trip_distance),
breaks=c(-10,-3,-2,-1,0,1,2,3,10),
labels = c(1,2,3,4,5,6,7,8))
trip_df %>%
select(-ehail_fee) %>%
drop_na() %>%
group_by(logdis) %>%
summarise( ave_tip = mean(tip_amount)) %>%
ggplot() +
geom_bar( mapping = aes( x = logdis, y = ave_tip), stat='identity') +
labs( x = 'logarithm of distance', y = 'average tip',
title = 'average tip vs. log(distance)')
From figure above we can see that there’s a clear pattern between logarithm distance and tip amount. I would like to include this variable in future analyses.
trip_df <- trip_df %>% select(-ehail_fee)
There are also several observations with negative incomes, the total amount charged to passengers. Does not include cash tips, which are abnormal. This could mean that taxi company give portion or whole taxi payment back to the passenger. We could drop these values since they’re not meaningful for our analysis.
trip_df %>%
filter( 0 > total_amount) %>%
select( VendorID, passenger_count, total_amount)
trip_df_clean <- trip_df %>%
filter( 0 <= total_amount)
After dropping negative total_amount values, RatecodeID 99 are removed. Therefore we could say that rateID 99 is a money back code.
trip_df %>%
pull( RatecodeID) %>%
unique()
## [1] 1 4 5 3 2 6 99 NA
trip_df_clean <- trip_df_clean %>%
filter(lpep_dropoff_datetime != lpep_pickup_datetime)
trip_df_clean <- trip_df_clean %>%
filter(trip_distance != 0) %>%
filter(trip_distance < 1000)
trip_df_outliers <- trip_df %>%
mutate( z = ((tip_amount - mean(tip_amount)) / sd(tip_amount))) %>%
filter( abs(z) > 3) %>%
drop_na()
trip_df_clean <- trip_df %>%
mutate( z = ((tip_amount - mean(tip_amount)) / sd(tip_amount))) %>%
filter( abs(z) <= 3) %>%
drop_na()
trip_df_clean
payment_type_dict = c("Credit Card", "Cash", "No Charge", "Dispute", "Unknown", "Voided Trip")
trip_df_clean %>%
mutate( payment_type_str = sapply(trip_df_clean $payment_type,
function(i) payment_type_dict[i])) %>%
select( payment_type_str, tip_amount) %>%
drop_na() %>%
ggplot( ) +
geom_boxplot( mapping = aes( y = tip_amount, x = payment_type_str,
fill = payment_type_str)) +
labs( x = "Payment Method", y = "Tips Amount (dollors)",
title = "Boxplot of Tips Amount verses Payment Method for Taxi.")
trip_df_outliers %>%
mutate( payment_type_str = sapply(trip_df_outliers $payment_type,
function(i) payment_type_dict[i])) %>%
select( payment_type_str, tip_amount) %>%
drop_na() %>%
ggplot( ) +
geom_point( mapping = aes( y = tip_amount, x = payment_type_str)) +
labs( x = "Payment Method", y = "Tips Amount (dollors)",
title = "Boxplot of Tips Amount verses Payment Method for Taxi.")
“The correlation coefficient is a statistical measure of the strength of the relationship between the relative movements of two variables.” The correlation between two variables are stronger if the coefficient is more close to 1. In this work, I use variables that have cor > 0.1 with tips_amount for further analyses.
They are:
DOLocationID
fare_amount
extra
total_amount
payment_type
congestion_surcharge
Other variables such day of a week and hour of a day have only weak correlation with tips amount, but log(distance) will try to use for tips prediction.
tip_cor_matrix <- cor(trip_df_clean[sapply(trip_df_clean, is.numeric)])[10,]
tip_cor_matrix
## VendorID RatecodeID PULocationID
## 0.0120461289 -0.0467045432 0.0247456231
## DOLocationID passenger_count trip_distance
## 0.1390033863 0.0004155833 0.0227077125
## fare_amount extra mta_tax
## 0.2159913270 0.1234111028 0.0625053302
## tip_amount tolls_amount improvement_surcharge
## 1.0000000000 0.0522904829 0.0590977816
## total_amount payment_type trip_type
## 0.3930686211 -0.6613746487 -0.0490953511
## congestion_surcharge z
## 0.3861026460 1.0000000000
Only looking at correlations greater than 0.1:
tip_cor_matrix[abs(tip_cor_matrix) > 0.1]
## DOLocationID fare_amount extra
## 0.1390034 0.2159913 0.1234111
## tip_amount total_amount payment_type
## 1.0000000 0.3930686 -0.6613746
## congestion_surcharge z
## 0.3861026 1.0000000
Selecting variables for prediction.
trip_df_for_prediction <- trip_df_clean %>%
select(DOLocationID, fare_amount, extra, total_amount, payment_type, congestion_surcharge, logdis, tip_amount)
glimpse(trip_df_for_prediction)
## Rows: 301,623
## Columns: 8
## $ DOLocationID <dbl> 41, 74, 7, 145, 166, 41, 173, 80, 213, 58, 179, 1…
## $ fare_amount <dbl> 4.5, 5.0, 6.0, 6.0, 4.0, 6.0, 14.0, 6.0, 10.0, 10…
## $ extra <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,…
## $ total_amount <dbl> 5.80, 6.30, 9.12, 7.30, 6.36, 7.30, 15.30, 7.30, …
## $ payment_type <dbl> 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1…
## $ congestion_surcharge <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0…
## $ logdis <fct> 4, 4, 4, 5, 4, 5, 5, 4, 5, 5, 5, 5, 4, 4, 5, 5, 5…
## $ tip_amount <dbl> 0.00, 0.00, 1.82, 0.00, 1.06, 0.00, 0.00, 0.00, 0…
trip_df_for_prediction$logdis <- as.numeric(trip_df_for_prediction$logdis)
None of these variables are categorical an non-numeric, therefore no encoding needed. However, conversion from factor to numeric is needed for logdis variable.
norma <- function(x){
(x-min(x))/(max(x)-min(x))
}
trip_df_norm <- sapply(trip_df_for_prediction, norma) %>% as.data.frame()
Split 7% to training set, and 3% for testing set. There are intotal 300k+ rows in dataset, it’s too many for compiling 2+ different k in a reasonable time. First, randomly pick 10%, then pick 70% and 30% for train and test.
set.seed(7)
trip_df_for_prediction_10p <- trip_df_for_prediction[sample(1:nrow(trip_df_for_prediction),
.1*floor(nrow(trip_df_for_prediction))), ]
sample_train_idx <- sample(1:nrow(trip_df_for_prediction_10p),
.1*floor(nrow(trip_df_for_prediction_10p)))
sample_train <- trip_df_for_prediction[sample_train_idx, ]
sample_test <- trip_df_for_prediction[-sample_train_idx, ]
From last assignment.
# Inputs: train data, test data, label of train data, and k.
library(class)
MSE <- function(x){
mean(sum((x - mean(x))^2))
}
knn.predict <- function( train, test, k){
knn_pred <- knn(train, test, train$tip_amount, k)
mse_i <- MSE( as.numeric(levels(knn_pred))[knn_pred]- test$tip_amount)
return (mse_i)
}
k ranges from 1 to 25 is used for knn and append to a tibble.
kvspred <- tibble('k'=numeric(), 'mse'=numeric())
for(i in 1:25){
mse.i = knn.predict(sample_train, sample_test,2)
kvspred <- add_row(kvspred, 'k'=i, 'mse'=mse.i)
}
Plot k vs. MSE.
ggplot(data = kvspred) +
geom_line( mapping = aes(x = k, y = mse)) +
labs( title = "MSE vs. k values")
In the plot we can see that k=3 and k=25 have the smallest MSE among 25 values. KNN prediction has at most 10% improvement by using k = 3 or 25 comparing to k = 24.
However, I don’t recommend using KNN for this type of prediction. The first thing is tips amount is more like a continuous variable instead of categorical. Therefore, linear regression should be more suitable for this circumstances. Also, KNN involves too much calculation for distance between points. for our 300k+ dataset, it would take too much time for making a prediction.